This notebook gives a head start with the Keras library. The wine dataset is analysed, primarily along the lines of this blog post: https://www.datacamp.com/community/tutorials/deep-learning-python. Documented minimally — this is quick and not so clean, just for later reference!
# import necessary libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from ggplot import *

from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import cohen_kappa_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import f1_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

from keras.models import Sequential
from keras.layers import Dense
# Load the UCI wine-quality datasets (semicolon-delimited CSVs) and summarize.
baseurl = "http://archive.ics.uci.edu/ml/machine-learning-databases/wine-quality/"
white = pd.read_csv(baseurl + "winequality-white.csv", sep=';')
red = pd.read_csv(baseurl + "winequality-red.csv", sep=';')
# quick structural check of one frame; the others are analogous
print(white.info())
# print(red.info())
# print(white.describe())
# print(red.describe())
# pairwise plot for 'white'
%matplotlib inline
sns.pairplot(white)
# pairwise plot for 'red'
%matplotlib inline
sns.pairplot(red)
# seems to show a predictable pattern for 'fixed acidity' versus 'density' as compared to white
# Combine the two datasets into one frame with a binary 'type' label, then visualize.
red['type'] = 1    # 1 marks red wine
white['type'] = 0  # 0 marks white wine
# DataFrame.append was deprecated and removed in pandas 2.0; pd.concat is the supported replacement
wines = pd.concat([red, white], ignore_index=True)
wines.info()
# scatter of fixed acidity vs density, smoothed, faceted by wine type
ggplot(aes(x='fixed acidity', y='density'), data = wines) +\
geom_point() +\
stat_smooth(color = 'red') +\
facet_wrap('type')
# Heatmap of the pairwise feature correlation matrix.
corr = wines.corr()
sns.heatmap(corr,
xticklabels=corr.columns.values,
yticklabels=corr.columns.values)
# seaborn no longer re-exports pyplot as sns.plt; call matplotlib's pyplot directly
plt.show()
# Train/test split on the first 11 columns (the physico-chemical features),
# predicting the binary 'type' label.
# .ix was deprecated and removed from pandas; .iloc is the positional equivalent.
X = wines.iloc[:, 0:11]
y = np.ravel(wines.type)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.33, random_state = 42)
# Standardize features to zero mean / unit variance.
# Fit the scaler on the training split only, then apply to both — avoids test-set leakage.
scaler = StandardScaler().fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)
# Build a small fully-connected binary classifier.
model = Sequential()
model.add(Dense(12, activation='relu', input_shape=(11,)))  # input layer: 11 features
model.add(Dense(8, activation='relu'))                      # one hidden layer
model.add(Dense(1, activation='sigmoid'))                   # sigmoid output for a 0/1 target
# Inspect the architecture, its configuration, and the initial weight tensors.
model.summary()
model.get_config()
model.get_weights()
# Compile for two-class classification and train on the scaled features.
model.compile(loss='binary_crossentropy',  # appropriate for a binary target
              optimizer='adam',            # an SGD variant with adaptive learning rates
              metrics=['accuracy'])
model.fit(X_train,
          y_train,
          epochs=20,     # loss seems to reach stability around this point
          batch_size=1,  # feed a single row at a time
          verbose=1)
# Turn the sigmoid outputs into hard 0/1 predictions by rounding.
y_pred = np.round(model.predict(X_test))
print(y_pred)
# Evaluate on the held-out split; evaluate() returns [loss, accuracy] for this model.
score = model.evaluate(X_test, y_test,verbose = 1)
print("\n")
print(["loss", "accuracy"])
print(score)
# Classification metrics on the held-out split.
# The bare float literals that followed some metric cells were pasted notebook
# outputs, not code (no-op expression statements); they are kept as comments.
print("\nConfusion matrix")
print( confusion_matrix(y_test, y_pred) )
# Precision
print("\nPrecision")
print( precision_score(y_test, y_pred) )   # last observed: 0.994565217391
# Recall
print("\nRecall")
print( recall_score(y_test, y_pred) )      # last observed: 0.98563734290843807
# F1 score
print("\nF1 score")
print( f1_score(y_test, y_pred) )          # last observed: 0.99008115419296661
# Cohen's kappa
print("\nCohen's kappa")
print( cohen_kappa_score(y_test, y_pred) )
# Baseline comparison: fit a random forest on the same train/test split.
rf_model = RandomForestClassifier(n_estimators=500,  # number of trees
                                  verbose=1,
                                  oob_score=True,    # out-of-bag accuracy estimate
                                  random_state=1)
rf_model.fit(X_train, y_train)
# out-of-bag score approximates generalization without touching the test set
print("\noob score")
print(rf_model.oob_score_)
print("\ntest score")
print(rf_model.score(X_test, y_test))
A lot of things can be done further, but that is for a different day!